runZinb = F
runClus = F
NCORES = 2

TL;DR

This document is a temptative of workflow with the following steps

  • Fit zinbwave with K = 50, X including the batches,
  • Get W which should capture the biology,
  • Use clusterExperiment on W,
  • Get the cluster labels,
  • Use slingshot on W from zinbwave and with cluster labels from clusterExperiment,
  • Get lineages,
  • Fit again zinbwave but with X including batches and lineages from slingshot,
  • Get DE genes between lineages.

Along the worflow, use deviance residuals as adjusted “counts”.

Unnormalized data

I am still using the data Davide gave me a while ago. And I think now it is not the same data Davide is using in its most recent vignette (that is zinb_clustering_over_k.Rmd). It should not be very important but it would be nice if we have a common dataset we can work on.

load("../data/Expt4c2b_filtdata.Rda")
load("../data/E4c2b_slingshot_wsforkelly.RData")

Here we only look at the 1000 most variable genes.

names(batch) <- colnames(counts)

counts <- counts[,names(clus.labels)]
batch <- droplevels(batch[names(clus.labels)])
qc <- qc[names(clus.labels),]

vars <- rowVars(log1p(counts))
names(vars) <- rownames(counts)
vars <- sort(vars, decreasing = TRUE)
core <- counts[names(vars)[1:1000],]

We have 616 cells.

dim(core)
## [1] 1000  616
core[1:3, 1:3]
##        OEP01_N706_S501 OEP01_N701_S501 OEP01_N707_S507
## Gstm1             8763            7221            3581
## Cbr2              5799            3638            1448
## Cyp2f2            2158            2027            1078

Cells have been processed in 18 different batches

table(batch)
## batch
## GBC08A GBC08B GBC09A GBC09B    P01    P02   P03A   P03B    P04    P05 
##     36     33     30     17     25     43     37     33     14     20 
##    P06    P10    P11    P12    P13    P14    Y01    Y04 
##     39     36     44     40     47     39     51     32

Cells have been clustered into 13 different clusters

table(clus.labels)
## clus.labels
##  1  2  3  4  5  7  8  9 10 11 12 14 15 
## 91 25 56 40 96 60 28 79 26 22 35 26 32

Batches are kind of confounded with the biology

table(data.frame(batch = as.vector(batch),
                 cluster = clus.labels))
##         cluster
## batch     1  2  3  4  5  7  8  9 10 11 12 14 15
##   GBC08A  0  2 12  9  0  0  0  0  0  2  0  2  9
##   GBC08B  0  7  5  3  0  0  0  1  2  4  0  5  6
##   GBC09A  0  1  5  9  0  0  0  1  1  0  0  6  7
##   GBC09B  0  2  2  7  0  0  0  3  0  0  0  3  0
##   P01     0  2  4  3 15  1  0  0  0  0  0  0  0
##   P02     2  0  9  3 15  3  3  2  3  0  2  1  0
##   P03A    3  0  3  0 12  2  9  4  2  0  2  0  0
##   P03B    1  2  1  1 11  1  2 10  1  1  2  0  0
##   P04     0  0  0  0 11  1  0  1  1  0  0  0  0
##   P05     0  0  0  1 11  3  0  1  0  2  2  0  0
##   P06     1  2  3  0  8  2  4  8  4  1  2  2  2
##   P10     3  1  4  0  4  5  9  2  0  2  5  0  1
##   P11     2  1  1  0  1  5  1 22  3  1  6  0  1
##   P12     0  2  0  0  4 10  0  8  2  3  6  4  1
##   P13     1  2  4  0  4 15  0  4  5  6  1  3  2
##   P14     0  0  1  2  0 12  0 12  2  0  7  0  3
##   Y01    47  1  1  2  0  0  0  0  0  0  0  0  0
##   Y04    31  0  1  0  0  0  0  0  0  0  0  0  0
compute.naive.residuals <- function(Y, zinb){
  mu_hat = t(getMu(zinb)) 
  pi_hat = t(getPi(zinb)) 
  Y_hat = (1 - pi_hat) * mu_hat
  Y - Y_hat 
}

compute.pearson.residuals <- function(Y, zinb){
  num = compute.naive.residuals(Y, zinb)
  mu = t(getMu(zinb)) 
  pi = t(getPi(zinb)) 
  phi = matrix(rep(getPhi(zinb), ncol(Y)), ncol = ncol(Y))
  var_hat = (1 - pi) * mu * (1 + mu * (phi + pi))
  num / sqrt(var_hat)  
}

compute.zinb.loglik <- function(Y, zinb){
  mu = t(getMu(zinb)) 
  theta = getTheta(zinb)
  theta_mat = matrix(rep(theta, ncol(Y), ncol = ncol(Y)))
  pi = t(getPi(zinb))
  log( pi * (Y == 0) + (1 - pi) * dnbinom(Y, size = theta, mu = mu) )
}

compute.deviance.residuals <- function(Y, zinb){
  mu_hat = t(getMu(zinb)) 
  pi_hat = t(getPi(zinb)) 
  Y_hat = (1 - pi_hat) * mu_hat
  ll = compute.zinb.loglik(Y, zinb)
  sign = 1*(Y - Y_hat > 0)
  sign[sign == 0] = -1
  sign * sqrt(-2 * ll)
}

0. Residuals for visualization

Fit zinbwave X intercept, no batch

Let’s run zinbwave with K = 0 and X batches.

fn = '../data/zinb_k0_batch.rda'
if (runZinb){
  mod = model.matrix( ~ batch)
  print(system.time(zinb0 <- zinbFit(core, K = 0, X = mod, ncores = NCORES)))
  save(zinb0, file = fn)
}else{
  load(fn)
}

We use deviance residuals for visualization. Let’s check that deviance residuals look ok.

res = compute.deviance.residuals(core, zinb0)
res[1:3,1:3]
##        OEP01_N706_S501 OEP01_N701_S501 OEP01_N707_S507
## Gstm1         4.637339        4.591452       -4.440574
## Cbr2          4.552445       -4.441892       -4.248677
## Cyp2f2        4.327763        4.310824       -4.159757

PCA on the residuals where color are for batches on the left and previously found clusters on the right.

pca = prcomp(t(res))
par(mfrow = c(1,2))
plot(pca$x, col = batch, pch = 19,
     main="PCA of residuals\ncolor=batch")
plot(pca$x, col = clus.labels, pch = 19,
     main = "PCA of residuals\ncolor=cluster")

par(mfrow = c(1,1))

Boxplot of the residuals for each cell.

res_order = res[, order(as.numeric(batch))]
col_order = as.numeric(batch)[order(as.numeric(batch))]
boxplot(res_order, main='Boxplot of residuals\ncolor=batch', col = col_order,         staplewex = 0, outline = 0, border = col_order, xaxt = 'n')

Deviance residuals seem ok.

1. zinbwave with raw counts

Let’s run zinbwave with K = 50 and X including batches.

fn = '../data/zinb_k50_batch.rda'
if (runZinb){
  mod = model.matrix( ~ batch)
  print(system.time(zinb50 <- zinbFit(core, ncores = NCORES, K = 50,
                                      X = mod)))
  save(zinb50, file = fn)
}else{
  load(fn)
}

2. clusterExperiment with W

I use the code of Davide from zinb_clutering_over_k.Rmd

## matching the parameters used by Russell
W = zinb50@W
fn = '../data/clustExp_W_k50.rda'
if(runClus) {
  cl_res <- clusterMany(t(W), ks = 4:15, alphas = c(0.1), betas = 0.8,
                        clusterFunction = "hierarchical01", minSizes=1,
                        sequential = TRUE, subsample = TRUE, ncores = 7,
                        subsampleArgs = list(resamp.num=100,
                                             clusterFunction="kmeans",
                                             clusterArgs=list(nstart=10)),
                        seqArgs = list(k.min=3, top.can=5), verbose=TRUE)
  save(cl_res, file = fn)
} else {
  load(fn)
}
combined <- combineMany(cl_res, proportion = 0.7)
## Note: no clusters specified to combine, using results from clusterMany
## Warning in .makeColors(clusters, colors = bigPalette): too many clusters to
## have unique color assignments
plotClusters(combined, colPalette = c(bigPalette, rainbow(100)))
plotCoClustering(combined)
## Warning in .makeColors(clusters, colors = bigPalette): too many clusters to
## have unique color assignments

table(primaryClusterNamed(combined))
## 
##  -1  c1  c2  c3  c4  c5  c6  c7  c8  c9 
## 142  92   5  92  74  12 108  46  30  15
sum(primaryCluster(combined) == -1)
## [1] 142
plot(getW(zinb50), col=clus.labels, main = 'color = previous')

plot(getW(zinb50), col=as.factor(primaryCluster(combined)), main = 'color = new')

table(data.frame(previous = clus.labels, new = primaryCluster(combined)))
##         new
## previous -1  1  2  3  4  5  6  7  8  9
##       1  40 48  2  0  1  0  0  0  0  0
##       2   2  0  0 23  0  0  0  0  0  0
##       3   1  2  0 51  1  1  0  0  0  0
##       4   4  0  0  0 21  0  0  0  0 15
##       5  42 42  3  0  1  8  0  0  0  0
##       7  10  0  0  0 49  1  0  0  0  0
##       8  28  0  0  0  0  0  0  0  0  0
##       9   4  0  0  1  1  0 73  0  0  0
##       10  1  0  0  0  0  0  0 25  0  0
##       11  5  0  0 15  0  2  0  0  0  0
##       12  0  0  0  0  0  0 35  0  0  0
##       14  4  0  0  1  0  0  0 21  0  0
##       15  1  0  0  1  0  0  0  0 30  0
combined <- makeDendrogram(combined)
plotDendrogram(combined)

merged <- mergeClusters(combined, mergeMethod = "locfdr", cutoff = 0.01)
## Note: Merging will be done on ' combineMany ', with clustering index 1
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted rates numerically 0 occurred
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted rates numerically 0 occurred

## Warning: glm.fit: fitted rates numerically 0 occurred
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted rates numerically 0 occurred
## Warning in locfdr::locfdr(tstats, plot = 0): CM estimation failed, middle
## of histogram non-normal
## Warning in .makeColors(clusters, colors = bigPalette): too many clusters to
## have unique color assignments

Just to see what it looks like, let’s look at the heatmap on the W (number of cells x K=50). Now, we would like to plot a heatmap using the residuals (number of cells x number of genes). When I call plotHeatmap using argument visualizeData = residuals_matrix, it does not work and says that if give separate visualizeData, must be of same dimensions as assay(data).

plotHeatmap(merged, clusterSamplesData="dendrogramValue", breaks = .99)

So, let’s look at a heatmap on deviance residuals using heatmap.2 from gplots package with all the 1000 most variable genes.

palDF = merged@clusterLegend[[1]]
pal = palDF[, 'color']
names(pal) = palDF[, 'name']

heatmap.2(res, Colv = merged@dendro_samples,
          scale = 'none', trace = 'none', 
          col = colorRampPalette(c('blue', 'yellow'))(51),
          ColSideColors = pal[primaryClusterNamed(merged)],
          labCol = '', main = 'Deviance Residuals, 1000 most variable genes')
legend("bottom", legend = names(pal), fill = pal, title = 'Clusters',
       horiz = T, cex = 0.5)

3. slingshot with W and cluster labels

I used the unsupervised mode on W got from fitting zinbwave with batches in X and K = 4.

fn = '../data/zinb_k4_batch.rda'
if (runZinb){
  cl_labs = primaryCluster(merged)
  batch_red = as.vector(batch)
  batch_red = batch_red[cl_labs != -1]
  cl_labs = cl_labs[cl_labs != -1]
  mod = model.matrix( ~ batch_red)
  zinb4 = zinbFit(core[,primaryCluster(merged) != -1],
                  X = mod, K = 4, ncores = NCORES)
  save(zinb4, file = fn)
}else{
  cl_labs = primaryCluster(merged)
  batch_red = as.vector(batch)
  batch_red = batch_red[cl_labs != -1]
  cl_labs = cl_labs[cl_labs != -1]
  mod = model.matrix( ~ batch_red)
  load(fn)
}
X <- zinb4@W

lineages <- get_lineages(X, clus.labels = cl_labs, start.clus = 1)
curves <- get_curves(X, clus.labels = cl_labs, lineages = lineages)
plot_curves(X, cl_labs, curves, col.clus = unique(cl_labs))

print(lineages$lineage1)
## [1] "1" "5" "3" "7" "6"
print(lineages$lineage2)
## [1] "1" "5" "3" "8"
print(lineages$lineage3)
## [1] "1" "5" "4" "9"
print(lineages$lineage4)
## [1] "1" "2"

4. zinbwave with lineages

Which clusters would it make sense to test one versus the other? Would it be interesting to do two by two comparison between clusters next to each other in the different lineages?

5. DE genes with betas

DE analysis using ZINB-Wave has not been implemented yet and we are still thinking about it. Would it make sense just to test the betas between clusters making the assumption that the genes are independent? Below is just a visualization of what we could have at the end of the worflow: a heatmap of the deviance residuals for DE genes between clusters.

de = read.csv('../data/oe_markers.txt', stringsAsFactors = F, header = F)
de = de$V1

breaks = setBreaks(res[rownames(res) %in% de, ], 0.99)
heatmap.2(res[rownames(res) %in% de, ], breaks = breaks,
          Colv = merged@dendro_samples,
          scale = 'none', trace = 'none', 
          col = colorRampPalette(c('blue', 'yellow'))(51),
          ColSideColors = pal[primaryClusterNamed(merged)],
          labCol = '', main = 'Deviance Residuals, DE genes (Russell)')
legend("bottom", legend = names(pal), fill = pal, title = 'Clusters',
       horiz = T, cex = 0.5)

Session Info

sessionInfo()
## R version 3.4.0 (2017-04-21)
## Platform: x86_64-apple-darwin15.6.0 (64-bit)
## Running under: macOS Sierra 10.12.4
## 
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/3.4/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/3.4/Resources/lib/libRlapack.dylib
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## attached base packages:
## [1] parallel  stats4    stats     graphics  grDevices utils     datasets 
## [8] methods   base     
## 
## other attached packages:
##  [1] clusterExperiment_1.2.0    rARPACK_0.11-0            
##  [3] slingshot_0.0.3-5          princurve_1.1-12          
##  [5] digest_0.6.12              RColorBrewer_1.1-2        
##  [7] scone_1.0.0                Rtsne_0.13                
##  [9] magrittr_1.5               gplots_3.0.1              
## [11] ggplot2_2.2.1              zinbwave_0.1.4            
## [13] scRNAseq_1.2.0             SummarizedExperiment_1.6.1
## [15] DelayedArray_0.2.4         matrixStats_0.52.2        
## [17] Biobase_2.36.2             GenomicRanges_1.28.3      
## [19] GenomeInfoDb_1.12.0        IRanges_2.10.2            
## [21] S4Vectors_0.14.2           BiocGenerics_0.22.0       
## 
## loaded via a namespace (and not attached):
##   [1] shinydashboard_0.6.0     R.utils_2.5.0           
##   [3] RSQLite_1.1-2            AnnotationDbi_1.38.0    
##   [5] htmlwidgets_0.8          grid_3.4.0              
##   [7] trimcluster_0.1-2        BiocParallel_1.10.1     
##   [9] RNeXML_2.0.7             DESeq_1.28.0            
##  [11] munsell_0.4.3            codetools_0.2-15        
##  [13] statmod_1.4.29           scran_1.4.4             
##  [15] DT_0.2                   miniUI_0.1.1            
##  [17] colorspace_1.3-2         energy_1.7-0            
##  [19] uuid_0.1-2               knitr_1.16              
##  [21] pspline_1.0-17           robustbase_0.92-7       
##  [23] bayesm_3.0-2             NMF_0.20.6              
##  [25] tximport_1.4.0           GenomeInfoDbData_0.99.0 
##  [27] hwriter_1.3.2            rhdf5_2.20.0            
##  [29] rprojroot_1.2            EDASeq_2.10.0           
##  [31] diptest_0.75-7           R6_2.2.1                
##  [33] doParallel_1.0.10        ggbeeswarm_0.5.3        
##  [35] taxize_0.8.4             locfit_1.5-9.1          
##  [37] flexmix_2.3-14           reshape_0.8.6           
##  [39] bitops_1.0-6             assertthat_0.2.0        
##  [41] scales_0.4.1             nnet_7.3-12             
##  [43] phylobase_0.8.4          beeswarm_0.2.3          
##  [45] gtable_0.2.0             RUVSeq_1.10.0           
##  [47] bold_0.4.0               rlang_0.1.1             
##  [49] genefilter_1.58.1        splines_3.4.0           
##  [51] rtracklayer_1.36.3       lazyeval_0.2.0          
##  [53] hexbin_1.27.1            rgl_0.98.1              
##  [55] yaml_2.1.14              reshape2_1.4.2          
##  [57] abind_1.4-5              GenomicFeatures_1.28.0  
##  [59] backports_1.1.0          httpuv_1.3.3            
##  [61] tensorA_0.36             tools_3.4.0             
##  [63] gridBase_0.4-7           dynamicTreeCut_1.63-1   
##  [65] stabledist_0.7-1         Rcpp_0.12.11            
##  [67] plyr_1.8.4               progress_1.1.2          
##  [69] visNetwork_1.0.3         zlibbioc_1.22.0         
##  [71] purrr_0.2.2.2            RCurl_1.95-4.8          
##  [73] prettyunits_1.0.2        viridis_0.4.0           
##  [75] zoo_1.8-0                cluster_2.0.6           
##  [77] data.table_1.10.4        RSpectra_0.12-0         
##  [79] mvtnorm_1.0-6            whisker_0.3-2           
##  [81] gsl_1.9-10.3             aroma.light_3.6.0       
##  [83] mime_0.5                 evaluate_0.10           
##  [85] xtable_1.8-2             XML_3.98-1.7            
##  [87] mclust_5.3               gridExtra_2.2.1         
##  [89] compiler_3.4.0           biomaRt_2.32.0          
##  [91] scater_1.4.0             tibble_1.3.1            
##  [93] KernSmooth_2.23-15       R.oo_1.21.0             
##  [95] htmltools_0.3.6          segmented_0.5-2.0       
##  [97] pcaPP_1.9-61             tidyr_0.6.3             
##  [99] geneplotter_1.54.0       howmany_0.3-1           
## [101] DBI_0.6-1                MASS_7.3-47             
## [103] fpc_2.1-10               MAST_1.2.1              
## [105] boot_1.3-19              compositions_1.40-1     
## [107] ade4_1.7-6               ShortRead_1.34.0        
## [109] Matrix_1.2-10            R.methodsS3_1.7.1       
## [111] gdata_2.17.0             igraph_1.0.1            
## [113] rncl_0.8.2               GenomicAlignments_1.12.1
## [115] registry_0.3             locfdr_1.1-8            
## [117] numDeriv_2016.8-1        plotly_4.6.0            
## [119] xml2_1.1.1               foreach_1.4.3           
## [121] annotate_1.54.0          vipor_0.4.5             
## [123] rngtools_1.2.4           pkgmaker_0.22           
## [125] XVector_0.16.0           stringr_1.2.0           
## [127] copula_0.999-16          ADGofTest_0.3           
## [129] softImpute_1.4           Biostrings_2.44.0       
## [131] rmarkdown_1.5            dendextend_1.5.2        
## [133] edgeR_3.18.1             kernlab_0.9-25          
## [135] shiny_1.0.3              Rsamtools_1.28.0        
## [137] gtools_3.5.0             modeltools_0.2-21       
## [139] rjson_0.2.15             nlme_3.1-131            
## [141] jsonlite_1.4             viridisLite_0.2.0       
## [143] limma_3.32.2             lattice_0.20-35         
## [145] httr_1.2.1               DEoptimR_1.0-8          
## [147] survival_2.41-3          FNN_1.1                 
## [149] prabclus_2.2-6           iterators_1.0.8         
## [151] glmnet_2.0-10            class_7.3-14            
## [153] stringi_1.1.5            mixtools_1.1.0          
## [155] latticeExtra_0.6-28      caTools_1.17.1          
## [157] memoise_1.1.0            dplyr_0.5.0             
## [159] ape_4.1